import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
# Plotly offline mode: one import and one init are enough
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import iplot
py.init_notebook_mode(connected=True)
import cufflinks as cf
cf.go_offline()
plt.style.use('fivethirtyeight')
app_train = pd.read_csv(r'C:\Users\Qingguo Wu\Desktop\Loan_risk_project\信用风险数据集\application_train.csv')
app_test = pd.read_csv(r'C:\Users\Qingguo Wu\Desktop\Loan_risk_project\信用风险数据集\application_test.csv')
app_train.head()
|   | SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 122 columns
# Define a function that summarises missing values
def missing_values_table(df):
    # Total missing values
    mis_val = df.isnull().sum()
    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})
    # Sort the table by percentage of missing, descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    # Return the dataframe with missing information
    return mis_val_table_ren_columns
missing_values = missing_values_table(app_train)
missing_values.head(20)
Your selected dataframe has 122 columns.
There are 67 columns that have missing values.
|   | Missing Values | % of Total Values |
|---|---|---|
| COMMONAREA_MEDI | 214865 | 69.9 |
| COMMONAREA_AVG | 214865 | 69.9 |
| COMMONAREA_MODE | 214865 | 69.9 |
| NONLIVINGAPARTMENTS_MEDI | 213514 | 69.4 |
| NONLIVINGAPARTMENTS_MODE | 213514 | 69.4 |
| NONLIVINGAPARTMENTS_AVG | 213514 | 69.4 |
| FONDKAPREMONT_MODE | 210295 | 68.4 |
| LIVINGAPARTMENTS_MODE | 210199 | 68.4 |
| LIVINGAPARTMENTS_MEDI | 210199 | 68.4 |
| LIVINGAPARTMENTS_AVG | 210199 | 68.4 |
| FLOORSMIN_MODE | 208642 | 67.8 |
| FLOORSMIN_MEDI | 208642 | 67.8 |
| FLOORSMIN_AVG | 208642 | 67.8 |
| YEARS_BUILD_MODE | 204488 | 66.5 |
| YEARS_BUILD_MEDI | 204488 | 66.5 |
| YEARS_BUILD_AVG | 204488 | 66.5 |
| OWN_CAR_AGE | 202929 | 66.0 |
| LANDAREA_AVG | 182590 | 59.4 |
| LANDAREA_MEDI | 182590 | 59.4 |
| LANDAREA_MODE | 182590 | 59.4 |
The dataset has 122 columns in total; 67 of them contain missing values, and the worst-affected column is 69.9% missing.
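Nothing is dropped at this point, but if one wanted to prune the worst columns, a minimal sketch building on the table above could look like this (the 65% threshold is purely illustrative):
# Hypothetical pruning step: list columns missing in more than 65% of rows
high_missing_cols = list(missing_values[missing_values['% of Total Values'] > 65].index)
# app_train = app_train.drop(columns=high_missing_cols)  # uncomment to actually drop them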
Next, explore the main training set for anomalous values, using descriptive statistics.
(app_train['DAYS_BIRTH'] / -365).describe()
count    307511.000000
mean         43.936973
std          11.956133
min          20.517808
25%          34.008219
50%          43.150685
75%          53.923288
max          69.120548
Name: DAYS_BIRTH, dtype: float64
Looking at the age distribution: the oldest applicant is 69 and the youngest is 20, so there is nothing anomalous here.
(app_train['DAYS_EMPLOYED']/-365).describe()
count    307511.000000
mean       -174.835742
std         387.056895
min       -1000.665753
25%           0.791781
50%           3.323288
75%           7.561644
max          49.073973
Name: DAYS_EMPLOYED, dtype: float64
Looking at the distribution of employment length, the minimum of roughly -1000 years is clearly anomalous.
app_train['DAYS_EMPLOYED'].plot.hist(title = 'Days Employment Histogram')
<AxesSubplot:title={'center':'Days Employment Histogram'}, ylabel='Frequency'>
The histogram of employment length shows that all the anomalous values are one and the same number, 365243. It is probably a sentinel for missing values, so replace it with NaN.
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
(app_train['DAYS_EMPLOYED']/-365).plot.hist(title = 'Days Employment Histogram');
plt.xlabel('Days Employment')
Text(0.5, 0, 'Days Employment')
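The same 365243 sentinel also occurs in app_test, so to keep train and test consistent the replacement should be mirrored there (a one-line sketch of the same call as above):
# Mirror the anomaly handling on the test set so DAYS_EMPLOYED-based features stay consistent
app_test['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace=True)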
Now compare how the features are distributed between defaulting and non-defaulting applicants.
def plot_stats(feature, label_rotation=False, horizontal_layout=True):
    temp = app_train[feature].value_counts()
    df1 = pd.DataFrame({feature: temp.index, 'Number of contracts': temp.values})
    # Calculate the percentage of target=1 per category value
    cat_perc = app_train[[feature, 'TARGET']].groupby([feature], as_index=False).mean()
    cat_perc.sort_values(by='TARGET', ascending=False, inplace=True)
    if horizontal_layout:
        fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 6))
    else:
        fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(12, 14))
    sns.set_color_codes("pastel")
    s = sns.barplot(ax=ax1, x=feature, y="Number of contracts", data=df1)
    if label_rotation:
        s.set_xticklabels(s.get_xticklabels(), rotation=90)
    s = sns.barplot(ax=ax2, x=feature, y='TARGET', order=cat_perc[feature], data=cat_perc)
    if label_rotation:
        s.set_xticklabels(s.get_xticklabels(), rotation=90)
    plt.ylabel('Percent of target with value 1 [%]', fontsize=10)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.show()
def plot_distribution(var):
    i = 0
    t1 = app_train.loc[app_train['TARGET'] != 0]
    t0 = app_train.loc[app_train['TARGET'] == 0]
    sns.set_style('whitegrid')
    plt.figure()
    fig, ax = plt.subplots(2, 2, figsize=(12, 12))
    for feature in var:
        i += 1
        plt.subplot(2, 2, i)
        sns.kdeplot(t1[feature], bw=0.5, label="TARGET = 1")
        sns.kdeplot(t0[feature], bw=0.5, label="TARGET = 0")
        plt.ylabel('Density plot', fontsize=12)
        plt.xlabel(feature, fontsize=12)
        locs, labels = plt.xticks()
        plt.tick_params(axis='both', which='major', labelsize=12)
    plt.show()
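plot_distribution is defined here but not called in this section; a usage sketch, with four complete (no-missing-value) columns chosen purely for illustration, would be:
# Hypothetical call: compare four continuous features across the two target classes
plot_distribution(['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'DAYS_BIRTH', 'DAYS_REGISTRATION'])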
plot_stats('CODE_GENDER')
Comparing default rates by gender, male applicants default more often: roughly 10% versus 7% for female applicants.
plt.figure(figsize = (10, 8))
# KDE plot of loans that were repaid on time
sns.kdeplot(app_train.loc[app_train['TARGET'] == 0, 'DAYS_BIRTH'] / -365, label = 'target == 0')
# KDE plot of loans which were not repaid on time
sns.kdeplot(app_train.loc[app_train['TARGET'] == 1, 'DAYS_BIRTH'] / -365, label = 'target == 1')
# Labeling of plot
plt.xlabel('Age (years)'); plt.ylabel('Density'); plt.title('Distribution of Ages')
Text(0.5, 1.0, 'Distribution of Ages')
Defaulters skew younger in the distribution, so the inferred conclusion is that the younger the applicant, the more likely they are to default.
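The visual impression can be backed up with a quick linear correlation check (a sketch; note that DAYS_BIRTH is stored as a negative day count, so a positive correlation with TARGET means younger applicants default more):
# Quick numeric check of the age effect seen in the KDE plot
print(app_train['TARGET'].corr(app_train['DAYS_BIRTH']))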
age_data = app_train[['TARGET', 'DAYS_BIRTH']]
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH'] / -365
# Bin the age data
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'], bins = np.linspace(20, 70, num = 11))
age_groups = age_data.groupby('YEARS_BINNED').mean()
plt.figure(figsize = (8, 8))
# Graph the age bins and the average of the target as a bar plot
plt.bar(age_groups.index.astype(str), 100 * age_groups['TARGET'])
# Plot labeling
plt.xticks(rotation = 75); plt.xlabel('Age Group (years)'); plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Age Group')
Text(0.5, 1.0, 'Failure to Repay by Age Group')
Looking further at default probability by age bracket confirms it: the younger the applicant, the higher the probability of default.
plot_stats('NAME_CONTRACT_TYPE')
Comparing cash loans and revolving loans, cash loans have the higher default rate.
plot_stats('FLAG_OWN_CAR')
plot_stats('FLAG_OWN_REALTY')
Owning a car or property also matters: applicants without a car or a home default more often, though the gap is not large.
plot_stats('NAME_FAMILY_STATUS',True, True)
By family status, most applicants are married; single applicants and those in a civil marriage have higher default rates, while widowed applicants have the lowest.
plot_stats('CNT_CHILDREN')
On children: most applicants have no children or fewer than three. Default rates rise with the number of children, and the handful of families with 9 or 11 children show a 100% default rate.
plot_stats('NAME_INCOME_TYPE',False,False)
Split by income type, applicants on maternity leave or unemployed default at rates above 35%, so lending to these two groups calls for extra caution.
plot_stats('OCCUPATION_TYPE',True, False)
By occupation, the lower-paid and less stable the job, the higher the default rate, e.g. low-skill laborers, drivers, and hairdressers, whereas stable, well-paid occupations such as accountants and high-tech staff default less.
plot_stats('NAME_EDUCATION_TYPE',True)
Most applicants' education level is secondary school, and the lower the education level, the more likely the default.
With this understanding of the features, try engineering some new ones:
CREDIT_INCOME_PERCENT: loan amount / applicant income. The expectation is that the larger this ratio, the more the loan exceeds what the income can support and the more likely the applicant is to default.
ANNUITY_INCOME_PERCENT: annual repayment amount / applicant income; same logic as above.
CREDIT_TERM: annual repayment amount / loan amount, i.e. the reciprocal of the repayment period in years. The guess is that loans with a short repayment period put more short-term pressure on the applicant, so the default probability is higher.
DAYS_EMPLOYED_PERCENT: days employed / applicant age in days.
INCOME_PER_CHILD: applicant income / number of children. For the same income, an applicant with a large family and many children carries a heavier burden, so the default probability may be higher.
HAS_HOUSE_INFORMATION: a binary feature based on whether the applicant's housing information is missing: 1 if present, 0 if missing.
app_train_domain = app_train.copy()
app_test_domain = app_test.copy()
app_train_domain['CREDIT_INCOME_PERCENT'] = app_train_domain['AMT_CREDIT'] / app_train_domain['AMT_INCOME_TOTAL']
app_train_domain['ANNUITY_INCOME_PERCENT'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_INCOME_TOTAL']
app_train_domain['CREDIT_TERM'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_CREDIT']
app_train_domain['DAYS_EMPLOYED_PERCENT'] = app_train_domain['DAYS_EMPLOYED'] / app_train_domain['DAYS_BIRTH']
app_train_domain['INCOME_PER_CHILD'] = app_train_domain['AMT_INCOME_TOTAL'] / app_train_domain['CNT_CHILDREN']
# NaN > 0 evaluates to False, so applicants with missing COMMONAREA_MEDI get 0
app_train_domain['HAS_HOUSE_INFORMATION'] = app_train_domain['COMMONAREA_MEDI'].apply(lambda x: 1 if x > 0 else 0)
plt.figure(figsize=(12, 20))
# Iterate through the new features
for i, feature in enumerate(['CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT', 'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT', 'INCOME_PER_CHILD']):
    # Create a new subplot for each feature
    plt.subplot(5, 1, i + 1)
    # Plot repaid loans
    sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == 0, feature], label='target == 0')
    # Plot loans that were not repaid
    sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == 1, feature], label='target == 1')
    # Label the plots
    plt.title('Distribution of %s by Target Value' % feature)
    plt.xlabel('%s' % feature)
    plt.ylabel('Density')
plt.tight_layout(h_pad=2.5)
We will put these into the model later to see the effect; first, redefine plot_stats on app_train_domain so the new binary feature can be inspected.
def plot_stats(feature, label_rotation=False, horizontal_layout=True):
    temp = app_train_domain[feature].value_counts()
    df1 = pd.DataFrame({feature: temp.index, 'Number of contracts': temp.values})
    # Calculate the percentage of target=1 per category value
    cat_perc = app_train_domain[[feature, 'TARGET']].groupby([feature], as_index=False).mean()
    cat_perc.sort_values(by='TARGET', ascending=False, inplace=True)
    if horizontal_layout:
        fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 6))
    else:
        fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(12, 14))
    sns.set_color_codes("pastel")
    s = sns.barplot(ax=ax1, x=feature, y="Number of contracts", data=df1)
    if label_rotation:
        s.set_xticklabels(s.get_xticklabels(), rotation=90)
    s = sns.barplot(ax=ax2, x=feature, y='TARGET', order=cat_perc[feature], data=cat_perc)
    if label_rotation:
        s.set_xticklabels(s.get_xticklabels(), rotation=90)
    plt.ylabel('Percent of target with value 1 [%]', fontsize=10)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.show()
plot_stats('HAS_HOUSE_INFORMATION',True)
Applicants with missing housing information default noticeably more often than those whose housing information is present.
Apply the same processing to the test set.
app_test_domain['CREDIT_INCOME_PERCENT'] = app_test_domain['AMT_CREDIT'] / app_test_domain['AMT_INCOME_TOTAL']
app_test_domain['ANNUITY_INCOME_PERCENT'] = app_test_domain['AMT_ANNUITY'] / app_test_domain['AMT_INCOME_TOTAL']
app_test_domain['CREDIT_TERM'] = app_test_domain['AMT_ANNUITY'] / app_test_domain['AMT_CREDIT']
app_test_domain['DAYS_EMPLOYED_PERCENT'] = app_test_domain['DAYS_EMPLOYED'] / app_test_domain['DAYS_BIRTH']
app_test_domain['INCOME_PER_CHILD'] = app_test_domain['AMT_INCOME_TOTAL'] / app_test_domain['CNT_CHILDREN']
app_test_domain['HAS_HOUSE_INFORMATION'] = app_test_domain['COMMONAREA_MEDI'].apply(lambda x:1 if x>0 else 0)
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder  # used by the 'le' encoding branch below
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc
def model(features, test_features, encoding='ohe', n_folds=5):
    # Extract the ids
    train_ids = features['SK_ID_CURR']
    test_ids = test_features['SK_ID_CURR']
    # Extract the labels for training
    labels = features['TARGET']
    # Remove the ids and target
    features = features.drop(columns=['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns=['SK_ID_CURR'])
    # One-hot encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        # Align the dataframes by the columns
        features, test_features = features.align(test_features, join='inner', axis=1)
        # No categorical indices to record
        cat_indices = 'auto'
    # Integer label encoding
    elif encoding == 'le':
        # Create a label encoder
        label_encoder = LabelEncoder()
        # List for storing categorical indices
        cat_indices = []
        # Iterate through each column
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical features to integers
                features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))
                # Record the categorical indices
                cat_indices.append(i)
    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    # Extract feature names
    feature_names = list(features.columns)
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    # Create the kfold object
    k_fold = KFold(n_splits=n_folds, shuffle=True, random_state=50)
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    # Empty array for out-of-fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        # Create the model
        model = lgb.LGBMClassifier(n_estimators=1000, objective='binary',
                                   class_weight='balanced', learning_rate=0.05,
                                   reg_alpha=0.1, reg_lambda=0.1,
                                   subsample=0.8, n_jobs=-1, random_state=50)
        # Train the model
        model.fit(train_features, train_labels, eval_metric='auc',
                  eval_set=[(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names=['valid', 'train'], categorical_feature=cat_indices,
                  early_stopping_rounds=100, verbose=200)
        # Record the best iteration
        best_iteration = model.best_iteration_
        # Record the feature importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits
        # Make predictions
        test_predictions += model.predict_proba(test_features, num_iteration=best_iteration)[:, 1] / k_fold.n_splits
        # Record the out-of-fold predictions
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration=best_iteration)[:, 1]
        # Record the best score
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)
    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})
    return submission, feature_importances, metrics
submission, fi, metrics = model(app_train_domain, app_test_domain)
print('Baseline metrics')
print(metrics)
del app_train_domain, app_test_domain
gc.collect()
Training Data Shape: (307511, 247)
Testing Data Shape: (48744, 247)
Training until validation scores don't improve for 100 rounds
[200] train's auc: 0.804421 train's binary_logloss: 0.541832 valid's auc: 0.76227 valid's binary_logloss: 0.557669
[400] train's auc: 0.833921 train's binary_logloss: 0.511411 valid's auc: 0.763074 valid's binary_logloss: 0.539011
Early stopping, best iteration is:
[418] train's auc: 0.836218 train's binary_logloss: 0.50891 valid's auc: 0.763194 valid's binary_logloss: 0.537474
Training until validation scores don't improve for 100 rounds
[200] train's auc: 0.804375 train's binary_logloss: 0.541874 valid's auc: 0.766396 valid's binary_logloss: 0.557693
Early stopping, best iteration is:
[247] train's auc: 0.812209 train's binary_logloss: 0.533819 valid's auc: 0.766563 valid's binary_logloss: 0.552859
Training until validation scores don't improve for 100 rounds
[200] train's auc: 0.803717 train's binary_logloss: 0.542765 valid's auc: 0.769687 valid's binary_logloss: 0.558004
Early stopping, best iteration is:
[229] train's auc: 0.808762 train's binary_logloss: 0.537685 valid's auc: 0.769938 valid's binary_logloss: 0.554773
Training until validation scores don't improve for 100 rounds
[200] train's auc: 0.804728 train's binary_logloss: 0.541835 valid's auc: 0.765429 valid's binary_logloss: 0.555999
Early stopping, best iteration is:
[298] train's auc: 0.821007 train's binary_logloss: 0.525363 valid's auc: 0.765922 valid's binary_logloss: 0.545861
Training until validation scores don't improve for 100 rounds
[200] train's auc: 0.804563 train's binary_logloss: 0.54162 valid's auc: 0.765072 valid's binary_logloss: 0.558914
Early stopping, best iteration is:
[237] train's auc: 0.810617 train's binary_logloss: 0.535513 valid's auc: 0.765252 valid's binary_logloss: 0.555296
Baseline metrics
fold train valid
0 0 0.836218 0.763194
1 1 0.812209 0.766563
2 2 0.808762 0.769938
3 3 0.821007 0.765922
4 4 0.810617 0.765252
5 overall 0.817763 0.766133
Use LightGBM's built-in feature importances to inspect which features matter.
def plot_feature_importances(df):
    # Sort features according to importance
    df = df.sort_values('importance', ascending=False).reset_index()
    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()
    # Make a horizontal bar chart of feature importances
    plt.figure(figsize=(10, 6))
    ax = plt.subplot()
    # Need to reverse the index to plot the most important on top
    ax.barh(list(reversed(list(df.index[:15]))),
            df['importance_normalized'].head(15),
            align='center', edgecolor='k')
    # Set the yticks and labels
    ax.set_yticks(list(reversed(list(df.index[:15]))))
    ax.set_yticklabels(df['feature'].head(15))
    # Plot labeling
    plt.xlabel('Normalized Importance')
    plt.title('Feature Importances')
    plt.show()
    return df
fi_sorted = plot_feature_importances(fi)
Next, extract information from the auxiliary tables and add it to the main training set as new features.
bureau = pd.read_csv(r'C:\Users\Qingguo Wu\Desktop\Loan_risk_project\信用风险数据集\bureau.csv')
bureau.head()
|   | SK_ID_CURR | SK_ID_BUREAU | CREDIT_ACTIVE | CREDIT_CURRENCY | DAYS_CREDIT | CREDIT_DAY_OVERDUE | DAYS_CREDIT_ENDDATE | DAYS_ENDDATE_FACT | AMT_CREDIT_MAX_OVERDUE | CNT_CREDIT_PROLONG | AMT_CREDIT_SUM | AMT_CREDIT_SUM_DEBT | AMT_CREDIT_SUM_LIMIT | AMT_CREDIT_SUM_OVERDUE | CREDIT_TYPE | DAYS_CREDIT_UPDATE | AMT_ANNUITY |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 215354 | 5714462 | Closed | currency 1 | -497 | 0 | -153.0 | -153.0 | NaN | 0 | 91323.0 | 0.0 | NaN | 0.0 | Consumer credit | -131 | NaN |
| 1 | 215354 | 5714463 | Active | currency 1 | -208 | 0 | 1075.0 | NaN | NaN | 0 | 225000.0 | 171342.0 | NaN | 0.0 | Credit card | -20 | NaN |
| 2 | 215354 | 5714464 | Active | currency 1 | -203 | 0 | 528.0 | NaN | NaN | 0 | 464323.5 | NaN | NaN | 0.0 | Consumer credit | -16 | NaN |
| 3 | 215354 | 5714465 | Active | currency 1 | -203 | 0 | NaN | NaN | NaN | 0 | 90000.0 | NaN | NaN | 0.0 | Credit card | -16 | NaN |
| 4 | 215354 | 5714466 | Active | currency 1 | -629 | 0 | 1197.0 | NaN | 77674.5 | 0 | 2700000.0 | NaN | NaN | 0.0 | Consumer credit | -21 | NaN |
For each loan applicant, count the number of loans they previously held at other financial institutions.
previous_loan_counts = bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count().rename(columns = {'SK_ID_BUREAU': 'previous_loan_counts'})
previous_loan_counts.head()
|   | SK_ID_CURR | previous_loan_counts |
|---|---|---|
| 0 | 100001 | 7 |
| 1 | 100002 | 8 |
| 2 | 100003 | 4 |
| 3 | 100004 | 2 |
| 4 | 100005 | 3 |
Then left-join the computed count feature onto the main training set.
app_train = app_train.merge(previous_loan_counts, on = 'SK_ID_CURR', how = 'left')
# Fill the missing values with 0
app_train['previous_loan_counts'] = app_train['previous_loan_counts'].fillna(0)
app_train.head()
|   | SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | previous_loan_counts |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 8.0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
5 rows × 123 columns
print(app_train[app_train.TARGET==1]['previous_loan_counts'].describe())
print(app_train[app_train.TARGET==0]['previous_loan_counts'].describe())
count    24825.000000
mean         4.613092
std          4.732792
min          0.000000
25%          1.000000
50%          3.000000
75%          7.000000
max         55.000000
Name: previous_loan_counts, dtype: float64
count    282686.000000
mean          4.778464
std           4.474586
min           0.000000
25%           1.000000
50%           4.000000
75%           7.000000
max         116.000000
Name: previous_loan_counts, dtype: float64
Comparing the summary statistics of previous_loan_counts for defaulters and non-defaulters: non-defaulters have slightly more previous loan applications on average, but the difference is very small.
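To go beyond eyeballing the summary statistics, a two-sample test can quantify whether the two distributions differ; a sketch using scipy (assumed to be available alongside the other scientific packages):
# Kolmogorov-Smirnov test: do defaulters and non-defaulters draw
# previous_loan_counts from the same distribution?
from scipy.stats import ks_2samp
stat, p_value = ks_2samp(
    app_train.loc[app_train['TARGET'] == 1, 'previous_loan_counts'],
    app_train.loc[app_train['TARGET'] == 0, 'previous_loan_counts'])
print('KS statistic = %0.4f, p-value = %0.4g' % (stat, p_value))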
Define a distribution-inspection helper so that, each time a new feature is built, its distribution among defaulters and non-defaulters can be checked quickly.
def kde_target(var_name, df):
    # Calculate the correlation coefficient between the new variable and the target
    corr = df['TARGET'].corr(df[var_name])
    # Calculate medians for repaid vs not repaid
    avg_repaid = df.loc[df['TARGET'] == 0, var_name].median()
    avg_not_repaid = df.loc[df['TARGET'] == 1, var_name].median()
    plt.figure(figsize=(12, 6))
    # Plot the distribution for target == 0 and target == 1
    sns.kdeplot(df.loc[df['TARGET'] == 0, var_name], label='TARGET == 0')
    sns.kdeplot(df.loc[df['TARGET'] == 1, var_name], label='TARGET == 1')
    # Label the plot
    plt.xlabel(var_name)
    plt.ylabel('Density')
    plt.title('%s Distribution' % var_name)
    plt.legend()
    # Print out the correlation
    print('The correlation between %s and the TARGET is %0.4f' % (var_name, corr))
    # Print out median values
    print('Median value for loan that was not repaid = %0.4f' % avg_not_repaid)
    print('Median value for loan that was repaid = %0.4f' % avg_repaid)
kde_target('previous_loan_counts', app_train)
The correlation between previous_loan_counts and the TARGET is -0.0100
Median value for loan that was not repaid = 3.0000
Median value for loan that was repaid = 4.0000
For continuous variables, compute per-applicant aggregate statistics to use as features.
bureau_agg = bureau.drop(columns = ['SK_ID_BUREAU']).groupby('SK_ID_CURR', as_index = False).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()
bureau_agg.head()
|   | SK_ID_CURR | DAYS_CREDIT count | DAYS_CREDIT mean | DAYS_CREDIT max | DAYS_CREDIT min | DAYS_CREDIT sum | CREDIT_DAY_OVERDUE count | CREDIT_DAY_OVERDUE mean | CREDIT_DAY_OVERDUE max | CREDIT_DAY_OVERDUE min | ... | DAYS_CREDIT_UPDATE count | DAYS_CREDIT_UPDATE mean | DAYS_CREDIT_UPDATE max | DAYS_CREDIT_UPDATE min | DAYS_CREDIT_UPDATE sum | AMT_ANNUITY count | AMT_ANNUITY mean | AMT_ANNUITY max | AMT_ANNUITY min | AMT_ANNUITY sum |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100001 | 7 | -735.000000 | -49 | -1572 | -5145 | 7 | 0.0 | 0 | 0 | ... | 7 | -93.142857 | -6 | -155 | -652 | 7 | 3545.357143 | 10822.5 | 0.0 | 24817.5 |
| 1 | 100002 | 8 | -874.000000 | -103 | -1437 | -6992 | 8 | 0.0 | 0 | 0 | ... | 8 | -499.875000 | -7 | -1185 | -3999 | 7 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 2 | 100003 | 4 | -1400.750000 | -606 | -2586 | -5603 | 4 | 0.0 | 0 | 0 | ... | 4 | -816.000000 | -43 | -2131 | -3264 | 0 | NaN | NaN | NaN | 0.0 |
| 3 | 100004 | 2 | -867.000000 | -408 | -1326 | -1734 | 2 | 0.0 | 0 | 0 | ... | 2 | -532.000000 | -382 | -682 | -1064 | 0 | NaN | NaN | NaN | 0.0 |
| 4 | 100005 | 3 | -190.666667 | -62 | -373 | -572 | 3 | 0.0 | 0 | 0 | ... | 3 | -54.333333 | -11 | -121 | -163 | 3 | 1420.500000 | 4261.5 | 0.0 | 4261.5 |
5 rows × 61 columns
Rename each column.
columns = ['SK_ID_CURR']
# Iterate through the variable names
for var in bureau_agg.columns.levels[0]:
    # Skip the id name
    if var != 'SK_ID_CURR':
        # Iterate through the stat names
        for stat in bureau_agg.columns.levels[1][:-1]:
            # Make a new column name for the variable and stat
            columns.append('bureau_%s_%s' % (var, stat))
bureau_agg.columns = columns
bureau_agg.head()
|   | SK_ID_CURR | bureau_DAYS_CREDIT_count | bureau_DAYS_CREDIT_mean | bureau_DAYS_CREDIT_max | bureau_DAYS_CREDIT_min | bureau_DAYS_CREDIT_sum | bureau_CREDIT_DAY_OVERDUE_count | bureau_CREDIT_DAY_OVERDUE_mean | bureau_CREDIT_DAY_OVERDUE_max | bureau_CREDIT_DAY_OVERDUE_min | ... | bureau_DAYS_CREDIT_UPDATE_count | bureau_DAYS_CREDIT_UPDATE_mean | bureau_DAYS_CREDIT_UPDATE_max | bureau_DAYS_CREDIT_UPDATE_min | bureau_DAYS_CREDIT_UPDATE_sum | bureau_AMT_ANNUITY_count | bureau_AMT_ANNUITY_mean | bureau_AMT_ANNUITY_max | bureau_AMT_ANNUITY_min | bureau_AMT_ANNUITY_sum |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100001 | 7 | -735.000000 | -49 | -1572 | -5145 | 7 | 0.0 | 0 | 0 | ... | 7 | -93.142857 | -6 | -155 | -652 | 7 | 3545.357143 | 10822.5 | 0.0 | 24817.5 |
| 1 | 100002 | 8 | -874.000000 | -103 | -1437 | -6992 | 8 | 0.0 | 0 | 0 | ... | 8 | -499.875000 | -7 | -1185 | -3999 | 7 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 2 | 100003 | 4 | -1400.750000 | -606 | -2586 | -5603 | 4 | 0.0 | 0 | 0 | ... | 4 | -816.000000 | -43 | -2131 | -3264 | 0 | NaN | NaN | NaN | 0.0 |
| 3 | 100004 | 2 | -867.000000 | -408 | -1326 | -1734 | 2 | 0.0 | 0 | 0 | ... | 2 | -532.000000 | -382 | -682 | -1064 | 0 | NaN | NaN | NaN | 0.0 |
| 4 | 100005 | 3 | -190.666667 | -62 | -373 | -572 | 3 | 0.0 | 0 | 0 | ... | 3 | -54.333333 | -11 | -121 | -163 | 3 | 1420.500000 | 4261.5 | 0.0 | 4261.5 |
5 rows × 61 columns
Likewise, left-join the newly built features onto the main dataset.
app_train = app_train.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')
app_train.head()
|   | SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | bureau_DAYS_CREDIT_UPDATE_count | bureau_DAYS_CREDIT_UPDATE_mean | bureau_DAYS_CREDIT_UPDATE_max | bureau_DAYS_CREDIT_UPDATE_min | bureau_DAYS_CREDIT_UPDATE_sum | bureau_AMT_ANNUITY_count | bureau_AMT_ANNUITY_mean | bureau_AMT_ANNUITY_max | bureau_AMT_ANNUITY_min | bureau_AMT_ANNUITY_sum |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 8.0 | -499.875 | -7.0 | -1185.0 | -3999.0 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 4.0 | -816.000 | -43.0 | -2131.0 | -3264.0 | 0.0 | NaN | NaN | NaN | 0.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 2.0 | -532.000 | -382.0 | -682.0 | -1064.0 | 0.0 | NaN | NaN | NaN | 0.0 |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 1.0 | -783.000 | -783.0 | -783.0 | -783.0 | 0.0 | NaN | NaN | NaN | 0.0 |
5 rows × 183 columns
A large batch of statistical features was just created in one go. As before, their predictive power needs checking; rather than inspecting each distribution one by one, the correlation of each feature with the target gives a quick screen.
# List of new correlations
new_corrs = []
# Iterate through the columns
for col in columns:
    # Calculate correlation with the target
    corr = app_train['TARGET'].corr(app_train[col])
    # Append as a (column, correlation) tuple
    new_corrs.append((col, corr))
# Sort by absolute magnitude of the correlation
new_corrs = sorted(new_corrs, key=lambda x: abs(x[1]), reverse=True)
new_corrs[:15]
[('bureau_DAYS_CREDIT_mean', 0.08972896721998985),
('bureau_DAYS_CREDIT_min', 0.07524825103011096),
('bureau_DAYS_CREDIT_UPDATE_mean', 0.06892735266969323),
('bureau_DAYS_ENDDATE_FACT_min', 0.05588737984391145),
('bureau_DAYS_CREDIT_ENDDATE_sum', 0.05373489560102582),
('bureau_DAYS_ENDDATE_FACT_mean', 0.05319962585757731),
('bureau_DAYS_CREDIT_max', 0.049782054639978035),
('bureau_DAYS_ENDDATE_FACT_sum', 0.04885350261111991),
('bureau_DAYS_CREDIT_ENDDATE_mean', 0.04698275433484625),
('bureau_DAYS_CREDIT_UPDATE_min', 0.04286392247073441),
('bureau_DAYS_CREDIT_sum', 0.0419998248148508),
('bureau_DAYS_CREDIT_UPDATE_sum', 0.04140363535306417),
('bureau_DAYS_CREDIT_ENDDATE_max', 0.036589634696337574),
('bureau_DAYS_CREDIT_ENDDATE_min', 0.03428110992162412),
('bureau_DAYS_ENDDATE_FACT_count', -0.03049230665332852)]
Among the 15 features with the largest absolute correlations, bureau_DAYS_CREDIT_mean has the strongest positive correlation with the target. DAYS_CREDIT is the average number of days before the current application that the applicant opened previous credit-bureau accounts; since the values are negative in this dataset, the real meaning is that the longer the applicant's credit history, the less likely they are to default.
kde_target('bureau_DAYS_CREDIT_mean', app_train)
The correlation between bureau_DAYS_CREDIT_mean and the TARGET is 0.0897
Median value for loan that was not repaid = -835.3333
Median value for loan that was repaid = -1067.0000
The feature shows some discriminative power.
To quickly apply the continuous-variable aggregation above to other datasets, wrap all the steps into a single function.
def agg_numeric(df, group_var, df_name):
    # Remove id variables other than the grouping variable
    for col in df:
        if col != group_var and 'SK_ID' in col:
            df = df.drop(columns=col)
    group_ids = df[group_var]
    numeric_df = df.select_dtypes('number')
    numeric_df[group_var] = group_ids
    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()
    # Need to create new column names
    columns = [group_var]
    # Iterate through the variable names
    for var in agg.columns.levels[0]:
        # Skip the grouping variable
        if var != group_var:
            # Iterate through the stat names
            for stat in agg.columns.levels[1][:-1]:
                # Make a new column name for the variable and stat
                columns.append('%s_%s_%s' % (df_name, var, stat))
    agg.columns = columns
    return agg
bureau_agg_new = agg_numeric(bureau.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_agg_new.head()
|   | SK_ID_CURR | bureau_DAYS_CREDIT_count | bureau_DAYS_CREDIT_mean | bureau_DAYS_CREDIT_max | bureau_DAYS_CREDIT_min | bureau_DAYS_CREDIT_sum | bureau_CREDIT_DAY_OVERDUE_count | bureau_CREDIT_DAY_OVERDUE_mean | bureau_CREDIT_DAY_OVERDUE_max | bureau_CREDIT_DAY_OVERDUE_min | ... | bureau_DAYS_CREDIT_UPDATE_count | bureau_DAYS_CREDIT_UPDATE_mean | bureau_DAYS_CREDIT_UPDATE_max | bureau_DAYS_CREDIT_UPDATE_min | bureau_DAYS_CREDIT_UPDATE_sum | bureau_AMT_ANNUITY_count | bureau_AMT_ANNUITY_mean | bureau_AMT_ANNUITY_max | bureau_AMT_ANNUITY_min | bureau_AMT_ANNUITY_sum |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100001 | 7 | -735.000000 | -49 | -1572 | -5145 | 7 | 0.0 | 0 | 0 | ... | 7 | -93.142857 | -6 | -155 | -652 | 7 | 3545.357143 | 10822.5 | 0.0 | 24817.5 |
| 1 | 100002 | 8 | -874.000000 | -103 | -1437 | -6992 | 8 | 0.0 | 0 | 0 | ... | 8 | -499.875000 | -7 | -1185 | -3999 | 7 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 2 | 100003 | 4 | -1400.750000 | -606 | -2586 | -5603 | 4 | 0.0 | 0 | 0 | ... | 4 | -816.000000 | -43 | -2131 | -3264 | 0 | NaN | NaN | NaN | 0.0 |
| 3 | 100004 | 2 | -867.000000 | -408 | -1326 | -1734 | 2 | 0.0 | 0 | 0 | ... | 2 | -532.000000 | -382 | -682 | -1064 | 0 | NaN | NaN | NaN | 0.0 |
| 4 | 100005 | 3 | -190.666667 | -62 | -373 | -572 | 3 | 0.0 | 0 | 0 | ... | 3 | -54.333333 | -11 | -121 | -163 | 3 | 1420.500000 | 4261.5 | 0.0 | 4261.5 |
5 rows × 61 columns
Likewise, define a function that computes correlations with the target.
def target_corrs(df):
    # List of correlations
    corrs = []
    # Iterate through the columns
    for col in df.columns:
        print(col)
        # Skip the target column
        if col != 'TARGET':
            # Calculate correlation with the target
            corr = df['TARGET'].corr(df[col])
            # Append as a (column, correlation) tuple
            corrs.append((col, corr))
    # Sort by absolute magnitude of the correlations
    corrs = sorted(corrs, key=lambda x: abs(x[1]), reverse=True)
    return corrs
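target_corrs is not invoked in this section; a usage sketch (assuming the bureau aggregates merged above are still present on app_train) would be:
# Hypothetical usage: screen the merged bureau aggregates against the target
bureau_corrs = target_corrs(app_train[['TARGET'] + list(bureau_agg_new.columns.drop('SK_ID_CURR'))])
bureau_corrs[:10]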
First, turn the categorical features in the dataset into dummy variables.
categorical = pd.get_dummies(bureau.select_dtypes('object'))
categorical['SK_ID_CURR'] = bureau['SK_ID_CURR']
categorical.head()
|   | CREDIT_ACTIVE_Active | CREDIT_ACTIVE_Bad debt | CREDIT_ACTIVE_Closed | CREDIT_ACTIVE_Sold | CREDIT_CURRENCY_currency 1 | CREDIT_CURRENCY_currency 2 | CREDIT_CURRENCY_currency 3 | CREDIT_CURRENCY_currency 4 | CREDIT_TYPE_Another type of loan | CREDIT_TYPE_Car loan | ... | CREDIT_TYPE_Loan for business development | CREDIT_TYPE_Loan for purchase of shares (margin lending) | CREDIT_TYPE_Loan for the purchase of equipment | CREDIT_TYPE_Loan for working capital replenishment | CREDIT_TYPE_Microloan | CREDIT_TYPE_Mobile operator loan | CREDIT_TYPE_Mortgage | CREDIT_TYPE_Real estate loan | CREDIT_TYPE_Unknown type of loan | SK_ID_CURR |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 215354 |
| 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 215354 |
| 2 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 215354 |
| 3 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 215354 |
| 4 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 215354 |
5 rows × 24 columns
Then compute the per-applicant sum and mean.
categorical_grouped = categorical.groupby('SK_ID_CURR').agg(['sum', 'mean'])
categorical_grouped.head()
| SK_ID_CURR | CREDIT_ACTIVE_Active sum | CREDIT_ACTIVE_Active mean | CREDIT_ACTIVE_Bad debt sum | CREDIT_ACTIVE_Bad debt mean | CREDIT_ACTIVE_Closed sum | CREDIT_ACTIVE_Closed mean | CREDIT_ACTIVE_Sold sum | CREDIT_ACTIVE_Sold mean | CREDIT_CURRENCY_currency 1 sum | CREDIT_CURRENCY_currency 1 mean | ... | CREDIT_TYPE_Microloan sum | CREDIT_TYPE_Microloan mean | CREDIT_TYPE_Mobile operator loan sum | CREDIT_TYPE_Mobile operator loan mean | CREDIT_TYPE_Mortgage sum | CREDIT_TYPE_Mortgage mean | CREDIT_TYPE_Real estate loan sum | CREDIT_TYPE_Real estate loan mean | CREDIT_TYPE_Unknown type of loan sum | CREDIT_TYPE_Unknown type of loan mean |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 100001 | 3 | 0.428571 | 0 | 0.0 | 4 | 0.571429 | 0 | 0.0 | 7 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
| 100002 | 2 | 0.250000 | 0 | 0.0 | 6 | 0.750000 | 0 | 0.0 | 8 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
| 100003 | 1 | 0.250000 | 0 | 0.0 | 3 | 0.750000 | 0 | 0.0 | 4 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
| 100004 | 0 | 0.000000 | 0 | 0.0 | 2 | 1.000000 | 0 | 0.0 | 2 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
| 100005 | 2 | 0.666667 | 0 | 0.0 | 1 | 0.333333 | 0 | 0.0 | 3 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
5 rows × 46 columns
Rename the columns again.
group_var = 'SK_ID_CURR'
# Need to create new column names
columns = []
# Iterate through the variable names
for var in categorical_grouped.columns.levels[0]:
    # Skip the grouping variable
    if var != group_var:
        # Iterate through the stat names
        for stat in ['count', 'count_norm']:
            # Make a new column name for the variable and stat
            columns.append('%s_%s' % (var, stat))
# Rename the columns
categorical_grouped.columns = columns
categorical_grouped.head()
| SK_ID_CURR | CREDIT_ACTIVE_Active_count | CREDIT_ACTIVE_Active_count_norm | CREDIT_ACTIVE_Bad debt_count | CREDIT_ACTIVE_Bad debt_count_norm | CREDIT_ACTIVE_Closed_count | CREDIT_ACTIVE_Closed_count_norm | CREDIT_ACTIVE_Sold_count | CREDIT_ACTIVE_Sold_count_norm | CREDIT_CURRENCY_currency 1_count | CREDIT_CURRENCY_currency 1_count_norm | ... | CREDIT_TYPE_Microloan_count | CREDIT_TYPE_Microloan_count_norm | CREDIT_TYPE_Mobile operator loan_count | CREDIT_TYPE_Mobile operator loan_count_norm | CREDIT_TYPE_Mortgage_count | CREDIT_TYPE_Mortgage_count_norm | CREDIT_TYPE_Real estate loan_count | CREDIT_TYPE_Real estate loan_count_norm | CREDIT_TYPE_Unknown type of loan_count | CREDIT_TYPE_Unknown type of loan_count_norm |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 100001 | 3 | 0.428571 | 0 | 0.0 | 4 | 0.571429 | 0 | 0.0 | 7 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
| 100002 | 2 | 0.250000 | 0 | 0.0 | 6 | 0.750000 | 0 | 0.0 | 8 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
| 100003 | 1 | 0.250000 | 0 | 0.0 | 3 | 0.750000 | 0 | 0.0 | 4 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
| 100004 | 0 | 0.000000 | 0 | 0.0 | 2 | 1.000000 | 0 | 0.0 | 2 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
| 100005 | 2 | 0.666667 | 0 | 0.0 | 1 | 0.333333 | 0 | 0.0 | 3 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
5 rows × 46 columns
The finished features can then be merged into the main training set.
app_train = app_train.merge(categorical_grouped, left_on = 'SK_ID_CURR', right_index = True, how = 'left')
app_train.head()
|   | SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | CREDIT_TYPE_Microloan_count | CREDIT_TYPE_Microloan_count_norm | CREDIT_TYPE_Mobile operator loan_count | CREDIT_TYPE_Mobile operator loan_count_norm | CREDIT_TYPE_Mortgage_count | CREDIT_TYPE_Mortgage_count_norm | CREDIT_TYPE_Real estate loan_count | CREDIT_TYPE_Real estate loan_count_norm | CREDIT_TYPE_Unknown type of loan_count | CREDIT_TYPE_Unknown type of loan_count_norm |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 229 columns
Wrap the categorical feature-extraction steps into a function as well.
def count_categorical(df, group_var, df_name):
    # Select the categorical columns and one-hot encode them
    categorical = pd.get_dummies(df.select_dtypes('object'))
    # Make sure to put the identifying id on the column
    categorical[group_var] = df[group_var]
    # Group by the group var and calculate the sum and mean
    categorical = categorical.groupby(group_var).agg(['sum', 'mean'])
    column_names = []
    # Iterate through the columns in level 0
    for var in categorical.columns.levels[0]:
        # Iterate through the stats in level 1
        for stat in ['count', 'count_norm']:
            # Make a new column name
            column_names.append('%s_%s_%s' % (df_name, var, stat))
    categorical.columns = column_names
    return categorical
bureau_counts = count_categorical(bureau, group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_counts.head()
| SK_ID_CURR | bureau_CREDIT_ACTIVE_Active_count | bureau_CREDIT_ACTIVE_Active_count_norm | bureau_CREDIT_ACTIVE_Bad debt_count | bureau_CREDIT_ACTIVE_Bad debt_count_norm | bureau_CREDIT_ACTIVE_Closed_count | bureau_CREDIT_ACTIVE_Closed_count_norm | bureau_CREDIT_ACTIVE_Sold_count | bureau_CREDIT_ACTIVE_Sold_count_norm | bureau_CREDIT_CURRENCY_currency 1_count | bureau_CREDIT_CURRENCY_currency 1_count_norm | ... | bureau_CREDIT_TYPE_Microloan_count | bureau_CREDIT_TYPE_Microloan_count_norm | bureau_CREDIT_TYPE_Mobile operator loan_count | bureau_CREDIT_TYPE_Mobile operator loan_count_norm | bureau_CREDIT_TYPE_Mortgage_count | bureau_CREDIT_TYPE_Mortgage_count_norm | bureau_CREDIT_TYPE_Real estate loan_count | bureau_CREDIT_TYPE_Real estate loan_count_norm | bureau_CREDIT_TYPE_Unknown type of loan_count | bureau_CREDIT_TYPE_Unknown type of loan_count_norm |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 100001 | 3 | 0.428571 | 0 | 0.0 | 4 | 0.571429 | 0 | 0.0 | 7 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
| 100002 | 2 | 0.250000 | 0 | 0.0 | 6 | 0.750000 | 0 | 0.0 | 8 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
| 100003 | 1 | 0.250000 | 0 | 0.0 | 3 | 0.750000 | 0 | 0.0 | 4 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
| 100004 | 0 | 0.000000 | 0 | 0.0 | 2 | 1.000000 | 0 | 0.0 | 2 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
| 100005 | 2 | 0.666667 | 0 | 0.0 | 1 | 0.333333 | 0 | 0.0 | 3 | 1.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 |
5 rows × 46 columns
Re-read the datasets to restore everything to its initial state.
app_train = pd.read_csv(r'C:\Users\Qingguo Wu\Desktop\Loan_risk_project\信用风险数据集\application_train.csv')
app_test = pd.read_csv(r'C:\Users\Qingguo Wu\Desktop\Loan_risk_project\信用风险数据集\application_test.csv')
bureau = pd.read_csv(r'C:\Users\Qingguo Wu\Desktop\Loan_risk_project\信用风险数据集\bureau.csv')
previous_application = pd.read_csv(r'C:\Users\Qingguo Wu\Desktop\Loan_risk_project\信用风险数据集\previous_application.csv')
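Note that re-reading the raw files restores the 365243 sentinel in DAYS_EMPLOYED, so before recomputing DAYS_EMPLOYED_PERCENT below, the earlier anomaly handling should be re-applied (a sketch mirroring the earlier cell):
# Re-apply the sentinel replacement that the re-read undid
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace=True)
app_test['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace=True)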
Re-add the features previously engineered on the main training set.
app_train['CREDIT_INCOME_PERCENT'] = app_train['AMT_CREDIT'] / app_train['AMT_INCOME_TOTAL']
app_train['ANNUITY_INCOME_PERCENT'] = app_train['AMT_ANNUITY'] / app_train['AMT_INCOME_TOTAL']
app_train['CREDIT_TERM'] = app_train['AMT_ANNUITY'] / app_train['AMT_CREDIT']
app_train['DAYS_EMPLOYED_PERCENT'] = app_train['DAYS_EMPLOYED'] / app_train['DAYS_BIRTH']
app_train['INCOME_PER_CHILD'] = app_train['AMT_INCOME_TOTAL'] / app_train['CNT_CHILDREN']
app_train['HAS_HOUSE_INFORMATION'] = app_train['COMMONAREA_MEDI'].apply(lambda x:1 if x>0 else 0)
app_test['CREDIT_INCOME_PERCENT'] = app_test['AMT_CREDIT'] / app_test['AMT_INCOME_TOTAL']
app_test['ANNUITY_INCOME_PERCENT'] = app_test['AMT_ANNUITY'] / app_test['AMT_INCOME_TOTAL']
app_test['CREDIT_TERM'] = app_test['AMT_ANNUITY'] / app_test['AMT_CREDIT']
app_test['DAYS_EMPLOYED_PERCENT'] = app_test['DAYS_EMPLOYED'] / app_test['DAYS_BIRTH']
app_test['INCOME_PER_CHILD'] = app_test['AMT_INCOME_TOTAL'] / app_test['CNT_CHILDREN']
app_test['HAS_HOUSE_INFORMATION'] = app_test['COMMONAREA_MEDI'].apply(lambda x:1 if x>0 else 0)
The two functions reproduce the earlier extraction of features from the bureau data's continuous and categorical variables.
bureau_counts = count_categorical(bureau, group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_agg_new = agg_numeric(bureau.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_agg_new.head()
|   | SK_ID_CURR | bureau_DAYS_CREDIT_count | bureau_DAYS_CREDIT_mean | bureau_DAYS_CREDIT_max | bureau_DAYS_CREDIT_min | bureau_DAYS_CREDIT_sum | bureau_CREDIT_DAY_OVERDUE_count | bureau_CREDIT_DAY_OVERDUE_mean | bureau_CREDIT_DAY_OVERDUE_max | bureau_CREDIT_DAY_OVERDUE_min | ... | bureau_DAYS_CREDIT_UPDATE_count | bureau_DAYS_CREDIT_UPDATE_mean | bureau_DAYS_CREDIT_UPDATE_max | bureau_DAYS_CREDIT_UPDATE_min | bureau_DAYS_CREDIT_UPDATE_sum | bureau_AMT_ANNUITY_count | bureau_AMT_ANNUITY_mean | bureau_AMT_ANNUITY_max | bureau_AMT_ANNUITY_min | bureau_AMT_ANNUITY_sum |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100001 | 7 | -735.000000 | -49 | -1572 | -5145 | 7 | 0.0 | 0 | 0 | ... | 7 | -93.142857 | -6 | -155 | -652 | 7 | 3545.357143 | 10822.5 | 0.0 | 24817.5 |
| 1 | 100002 | 8 | -874.000000 | -103 | -1437 | -6992 | 8 | 0.0 | 0 | 0 | ... | 8 | -499.875000 | -7 | -1185 | -3999 | 7 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 2 | 100003 | 4 | -1400.750000 | -606 | -2586 | -5603 | 4 | 0.0 | 0 | 0 | ... | 4 | -816.000000 | -43 | -2131 | -3264 | 0 | NaN | NaN | NaN | 0.0 |
| 3 | 100004 | 2 | -867.000000 | -408 | -1326 | -1734 | 2 | 0.0 | 0 | 0 | ... | 2 | -532.000000 | -382 | -682 | -1064 | 0 | NaN | NaN | NaN | 0.0 |
| 4 | 100005 | 3 | -190.666667 | -62 | -373 | -572 | 3 | 0.0 | 0 | 0 | ... | 3 | -54.333333 | -11 | -121 | -163 | 3 | 1420.500000 | 4261.5 | 0.0 | 4261.5 |
5 rows × 61 columns
Add the bureau-derived features to the training and test sets.
app_train = app_train.merge(bureau_counts, on = 'SK_ID_CURR', how = 'left')
app_train = app_train.merge(bureau_agg_new, on = 'SK_ID_CURR', how = 'left')
app_test = app_test.merge(bureau_counts, on = 'SK_ID_CURR', how = 'left')
app_test = app_test.merge(bureau_agg_new, on = 'SK_ID_CURR', how = 'left')
print(app_train.shape)
print(app_test.shape)
(307511, 615)
(48744, 614)
To improve the model, weed out collinear features: compute the pairwise correlation between variables and quickly remove those that are too highly correlated, using a threshold of 0.8.
corrs = app_train.corr()
# Set the threshold
threshold = 0.8
# Empty dictionary to hold correlated variables
above_threshold_vars = {}
# For each column, record the variables that are above the threshold
for col in corrs:
    above_threshold_vars[col] = list(corrs.index[corrs[col] > threshold])
# Track columns to remove and columns already examined
cols_to_remove = []
cols_seen = []
cols_to_remove_pair = []
# Iterate through columns and correlated columns
for key, value in above_threshold_vars.items():
    # Keep track of columns already examined
    cols_seen.append(key)
    for x in value:
        if x == key:
            continue  # a column is always perfectly correlated with itself
        else:
            # Only want to remove one column in each correlated pair
            if x not in cols_seen:
                cols_to_remove.append(x)
                cols_to_remove_pair.append(key)
cols_to_remove = list(set(cols_to_remove))
print('Number of columns to remove: ', len(cols_to_remove))
Number of columns to remove: 189
In total, 189 highly correlated columns will be removed.
train_corrs_removed = app_train.drop(columns = cols_to_remove)
test_corrs_removed = app_test.drop(columns = cols_to_remove)
print('Training Corrs Removed Shape: ', train_corrs_removed.shape)
print('Testing Corrs Removed Shape: ', test_corrs_removed.shape)
Training Corrs Removed Shape:  (307511, 426)
Testing Corrs Removed Shape:  (48744, 425)
After dropping the corresponding columns from both sets, the training set is down to 426 columns (the test set has 425, since it lacks TARGET).
submission, fi, metrics = model(train_corrs_removed, test_corrs_removed)
print('metrics')
print(metrics)
Training Data Shape: (307511, 545)
Testing Data Shape: (48744, 545)
Training until validation scores don't improve for 100 rounds
[200] train's auc: 0.823433 train's binary_logloss: 0.522866 valid's auc: 0.777219 valid's binary_logloss: 0.540798
[400] train's auc: 0.85854 train's binary_logloss: 0.484692 valid's auc: 0.778442 valid's binary_logloss: 0.516702
Early stopping, best iteration is:
[362] train's auc: 0.852718 train's binary_logloss: 0.491023 valid's auc: 0.778943 valid's binary_logloss: 0.520737
Training until validation scores don't improve for 100 rounds
[200] train's auc: 0.823834 train's binary_logloss: 0.522443 valid's auc: 0.77811 valid's binary_logloss: 0.540163
[400] train's auc: 0.859698 train's binary_logloss: 0.48315 valid's auc: 0.778282 valid's binary_logloss: 0.516133
Early stopping, best iteration is:
[358] train's auc: 0.853153 train's binary_logloss: 0.490494 valid's auc: 0.778864 valid's binary_logloss: 0.520621
Training until validation scores don't improve for 100 rounds
[200] train's auc: 0.82312 train's binary_logloss: 0.523464 valid's auc: 0.779867 valid's binary_logloss: 0.542628
[400] train's auc: 0.859015 train's binary_logloss: 0.484814 valid's auc: 0.781773 valid's binary_logloss: 0.517528
Early stopping, best iteration is:
[397] train's auc: 0.858536 train's binary_logloss: 0.485352 valid's auc: 0.781831 valid's binary_logloss: 0.517898
Training until validation scores don't improve for 100 rounds
[200] train's auc: 0.823076 train's binary_logloss: 0.523263 valid's auc: 0.779038 valid's binary_logloss: 0.540561
[400] train's auc: 0.858485 train's binary_logloss: 0.484973 valid's auc: 0.779057 valid's binary_logloss: 0.51687
Early stopping, best iteration is:
[317] train's auc: 0.845335 train's binary_logloss: 0.499209 valid's auc: 0.779816 valid's binary_logloss: 0.525713
Training until validation scores don't improve for 100 rounds
[200] train's auc: 0.82383 train's binary_logloss: 0.522341 valid's auc: 0.775657 valid's binary_logloss: 0.542691
[400] train's auc: 0.859321 train's binary_logloss: 0.483644 valid's auc: 0.777071 valid's binary_logloss: 0.518796
Early stopping, best iteration is:
[314] train's auc: 0.845584 train's binary_logloss: 0.498844 valid's auc: 0.77733 valid's binary_logloss: 0.52802
metrics
fold train valid
0 0 0.852718 0.778943
1 1 0.853153 0.778864
2 2 0.858536 0.781831
3 3 0.845335 0.779816
4 4 0.845584 0.777330
5 overall 0.851065 0.779357
fi_sorted = plot_feature_importances(fi)
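Finally, the averaged test-set predictions returned by model can be written out for a Kaggle-style submission (the file name here is arbitrary):
# Save the submission dataframe produced by model()
submission.to_csv('baseline_with_bureau_features.csv', index=False)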